# Dependencies
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(bizdays)
##
## Attaching package: 'bizdays'
## The following object is masked from 'package:stats':
##
## offset
library(stringr)
# Create a calendar that lists federal holidays
bizdays::load_builtin_calendars()
# Using Brazil/ANBIMA because ANBIMA is no longer functional: https://github.com/msperlin/GetTDData/issues/10
# The difference seems to be trivial, as they seem to be the same
cal = bizdays::calendars()[["Brazil/ANBIMA"]]
# 57 features -> 31 features
JPM_balance_sheet = read_csv("../raw_data/Original/BalanceSheetJPM.csv")
## New names:
## Rows: 57 Columns: 72
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (35): ...1, 30 SEP '25, 31 DEC '23, 31 DEC '22, 31 DEC '21, 30 JUN '21, ... num
## (37): 30 JUN '25, 31 MAR '25, 31 DEC '24, 30 SEP '24, 30 JUN '24, 31 MAR...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_balance_sheet = na.omit(JPM_balance_sheet)
JPM_balance_sheet = JPM_balance_sheet %>% mutate(across(-1, as.character)) %>% pivot_longer(
cols = -1,
names_to = "Year_Quarter",
values_to = "value"
) %>%
pivot_wider(
names_from = 1,
values_from = value
) %>%
mutate(Year_Quarter = case_when(
grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
TRUE ~ Year_Quarter
))
JPM_balance_sheet
# YTD
# 73 features -> 41 features
JPM_cash_flow = read_csv("../raw_data/Original/CashFlowJPM.csv")
## New names:
## Rows: 73 Columns: 75
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (75): ...1, 30 JUN '25, 31 MAR '25, 31 DEC '24, 30 SEP '24, 30 JUN '24, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_cash_flow = na.omit(JPM_cash_flow)
JPM_cash_flow = JPM_cash_flow %>% mutate(across(-1, as.character)) %>% pivot_longer(
cols = -1,
names_to = "Year_Quarter",
values_to = "value") %>% pivot_wider(
names_from = 1,
values_from = value,
values_fn = first
) %>%
mutate(Year_Quarter = case_when(
grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
TRUE ~ Year_Quarter
)) %>% mutate(across(!Year_Quarter, ~ {
. %>%
str_replace_all("\\.", "") %>% # Remove thousands separator (period)
str_replace(",", ".") %>% # Replace decimal comma with period
as.numeric()
}))
JPM_cash_flow = JPM_cash_flow %>% arrange(Year_Quarter) %>% slice(5:n()) %>% mutate(
Year = as.numeric(substr(Year_Quarter, 1, 4))
) %>% group_by(Year) %>% mutate(
across(
where(is.numeric),
~ if_else(
row_number() == 1, # if it's Q1 (first in group)
.x, # keep original YTD value
.x - lag(.x) # otherwise, subtract previous quarter
))
) %>% ungroup() %>% select(!Year)
JPM_cash_flow
# No features lost
FED_funds = read_csv("../raw_data/Original/FEDFUNDS.csv")
## Rows: 856 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): FEDFUNDS
## date (1): observation_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
FED_funds = na.omit(FED_funds)
FED_funds = FED_funds %>%
mutate(Year_Quarter = paste0(year(observation_date), "-Q", quarter(observation_date))) %>%
select(Year_Quarter, FEDFUNDS) %>%
group_by(Year_Quarter) %>%
summarise(FEDFUNDS = mean(FEDFUNDS, na.rm = TRUE)) %>%
ungroup()
FED_funds
# YTD
# 53 features -> 31 features
JPM_income = read_csv("../raw_data/Original/IncomeStatementJPM.csv")
## New names:
## Rows: 53 Columns: 76
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 num (75): 30 SEP '25, 30 JUN '25, 31 MAR '25, 31 DEC '24, 30 SEP '24,
## 30 JUN...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_income = na.omit(JPM_income)
JPM_income = JPM_income %>% mutate(across(-1, as.character)) %>% pivot_longer(
cols = -1,
names_to = "Year_Quarter",
values_to = "value") %>% pivot_wider(
names_from = 1,
values_from = value,
values_fn = first
) %>%
mutate(Year_Quarter = case_when(
grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
TRUE ~ Year_Quarter
))
JPM_income = JPM_income %>% mutate(across(!Year_Quarter, ~ {
. %>%
str_replace_all("\\.", "") %>% # Remove thousands separator (period)
str_replace(",", ".") %>% # Replace decimal comma with period
as.numeric()
}))
JPM_income = JPM_income %>% arrange(Year_Quarter) %>% slice(5:n()) %>% mutate(
Year = as.numeric(substr(Year_Quarter, 1, 4))
) %>% group_by(Year) %>% mutate(
across(
where(is.numeric),
~ if_else(
row_number() == 1, # if it's Q1 (first in group)
.x, # keep original YTD value
.x - lag(.x) # otherwise, subtract previous quarter
))
) %>% ungroup() %>% select(!Year)
JPM_income
# 57 features -> 31 features
JPM_balance_sheet = read_csv("../raw_data/Original/BalanceSheetJPM.csv")
## New names:
## Rows: 57 Columns: 72
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (35): ...1, 30 SEP '25, 31 DEC '23, 31 DEC '22, 31 DEC '21, 30 JUN '21, ... num
## (37): 30 JUN '25, 31 MAR '25, 31 DEC '24, 30 SEP '24, 30 JUN '24, 31 MAR...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_balance_sheet = na.omit(JPM_balance_sheet)
JPM_balance_sheet = JPM_balance_sheet %>% mutate(across(-1, as.character)) %>% pivot_longer(
cols = -1,
names_to = "Year_Quarter",
values_to = "value"
) %>%
pivot_wider(
names_from = 1,
values_from = value
) %>%
mutate(Year_Quarter = case_when(
grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
TRUE ~ Year_Quarter
))
JPM_balance_sheet
JPM_ratios = read_csv("../raw_data/Original/RatiosJPM.csv")
## New names:
## Rows: 60 Columns: 101
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (96): ...1, SEP '25, JUN '25, MAR '25, DEC '24, SEP '24, JUN '24, MAR '2... num
## (5): SEP '06, JUN '06, MAR '06, DEC '05, SEP '05
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_ratios = na.omit(JPM_ratios)
JPM_ratios = JPM_ratios %>% mutate(across(-1, as.character)) %>% pivot_longer(
cols = -1,
names_to = "Year_Quarter",
values_to = "value"
) %>%
pivot_wider(
names_from = 1,
values_from = value
) %>%
mutate(Year_Quarter = case_when(
grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
TRUE ~ Year_Quarter
))
JPM_key = read_csv("../raw_data/Original/KeyItemsJPM_Interpolated.csv")
## New names:
## Rows: 34 Columns: 71
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (71): ...1, SEP '25, JUN '25, MAR '25, DEC '24, SEP '24, JUN '24, MAR '2...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_key = na.omit(JPM_key)
JPM_key = JPM_key %>% mutate(across(-1, as.character)) %>% pivot_longer(
cols = -1,
names_to = "Year_Quarter",
values_to = "value"
) %>%
pivot_wider(
names_from = 1,
values_from = value
) %>%
mutate(Year_Quarter = case_when(
grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
TRUE ~ Year_Quarter
))
HYBond = read_csv("../raw_data/Original/HYBond_interpolated.csv")
## Rows: 7642 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): BAMLH0A0HYM2EY
## date (1): observation_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
HYBond = na.omit(HYBond)
HYBond = HYBond %>% rename(Date = observation_date, HY_bond_rate = BAMLH0A0HYM2EY)
HYBond
IgBond = read_csv("../raw_data/Original/IgBond.csv")
## Rows: 7642 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): BAMLC0A4CBBBEY
## date (1): observation_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
IgBond = na.omit(IgBond)
IgBond = IgBond %>% rename(Date = observation_date, IG_bond_rate = BAMLC0A4CBBBEY)
IgBond
JPM = read_csv("../raw_data/Original/PriceHistoryJPM.csv")
## Rows: 6501 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date
## dbl (10): Price, CVol, Change, % Change, % Return, Total Return (Gross), Ope...
## num (1): Cumulative Return %
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fill in dates to identify quarters
JPM = JPM %>% mutate(Date = offset(as.Date("2025-11-02"), -(0:(n()-1)), cal), Year_Quarter = paste0(year(Date), "-Q", quarter(Date)))
JPM = na.omit(JPM %>% select(Date, Year_Quarter, everything())) # Remove rows with NA values (No value generation for now)
# Merge the datasets
JPM = JPM %>%
inner_join(JPM_balance_sheet, by = "Year_Quarter") %>%
inner_join(JPM_cash_flow, by = "Year_Quarter") %>%
inner_join(FED_funds, by = "Year_Quarter") %>%
inner_join(JPM_income, by = "Year_Quarter") %>%
inner_join(JPM_ratios, by = "Year_Quarter") %>%
inner_join(JPM_key, by = "Year_Quarter") %>%
inner_join(HYBond, by="Date") %>%
inner_join(IgBond, by="Date")
JPM = na.omit(JPM)
JPM <- JPM %>%
arrange(Date) %>%
mutate(
# change from yesterday → today
Percent_change_backward = (Price - lag(Price)) / lag(Price) * 100,
# change from today → tomorrow
Percent_change_forward = (lead(Price) - Price) / Price * 100
)
JPM = JPM %>% select(Date, Year_Quarter, Price, Percent_change_backward, Percent_change_forward, everything()) %>% drop_na()
JPM
bin_return = function(x) {
case_when(
x < -10 ~ 1,
x >= -10 & x < -9 ~ 2,
x >= -9 & x < -8 ~ 3,
x >= -8 & x < -7 ~ 4,
x >= -7 & x < -6 ~ 5,
x >= -6 & x < -5 ~ 6,
x >= -5 & x < -4.5 ~ 7,
x >= -4.5 & x < -4 ~ 8,
x >= -4 & x < -3.5 ~ 9,
x >= -3.5 & x < -3 ~ 10,
x >= -3 & x < -2.5 ~ 11,
x >= -2.5 & x < -2.05 ~ 12,
x >= -2.05 & x < -1.85 ~ 13,
x >= -1.85 & x < -1.65 ~ 14,
x >= -1.65 & x < -1.45 ~ 15,
x >= -1.45 & x < -1.25 ~ 16,
x >= -1.25 & x < -1.05 ~ 17,
x >= -1.05 & x < -0.95 ~ 18,
x >= -0.95 & x < -0.85 ~ 19,
x >= -0.85 & x < -0.75 ~ 20,
x >= -0.75 & x < -0.65 ~ 21,
x >= -0.65 & x < -0.55 ~ 22,
x >= -0.55 & x < -0.45 ~ 23,
x >= -0.45 & x < -0.35 ~ 24,
x >= -0.35 & x < -0.25 ~ 25,
x >= -0.25 & x < -0.15 ~ 26,
x >= -0.15 & x < -0.05 ~ 27,
x >= -0.05 & x < 0.05 ~ 28,
x >= 0.05 & x < 0.15 ~ 29,
x >= 0.15 & x < 0.25 ~ 30,
x >= 0.25 & x < 0.35 ~ 31,
x >= 0.35 & x < 0.45 ~ 32,
x >= 0.45 & x < 0.55 ~ 33,
x >= 0.55 & x < 0.65 ~ 34,
x >= 0.65 & x < 0.75 ~ 35,
x >= 0.75 & x < 0.85 ~ 36,
x >= 0.85 & x < 0.95 ~ 37,
x >= 0.95 & x < 1.05 ~ 38,
x >= 1.05 & x < 1.25 ~ 39,
x >= 1.25 & x < 1.45 ~ 40,
x >= 1.45 & x < 1.65 ~ 41,
x >= 1.65 & x < 1.85 ~ 42,
x >= 1.85 & x < 2.05 ~ 43,
x >= 2.05 & x < 2.55 ~ 44,
x >= 2.55 & x < 3.05 ~ 45,
x >= 3.05 & x < 3.55 ~ 46,
x >= 3.55 & x < 4.05 ~ 47,
x >= 4.05 & x < 4.55 ~ 48,
x >= 4.55 & x < 5 ~ 49,
x >= 5 & x < 6 ~ 50,
x >= 6 & x < 7 ~ 51,
x >= 7 & x < 8 ~ 52,
x >= 8 & x < 9 ~ 53,
x >= 9 & x < 10 ~ 54,
x > 10 ~ 55,
TRUE ~ NA_integer_
)
}
JPM = JPM %>% mutate(
Backward_Bin = bin_return(Percent_change_backward),
Forward_Bin = bin_return(Percent_change_forward)
)
label = JPM %>% select(Forward_Bin)
JPM <- JPM %>%
select(
Date,
Year_Quarter,
Price,
Percent_change_backward,
Backward_Bin,
everything(),
-Forward_Bin,
-Percent_change_forward
)
JPM
# Remove European commas and convert everything to numeric decimal, remove Date and Quarter features
JPM <- JPM %>%
select(-Year_Quarter, -Date) %>%
mutate(across(where(is.character), ~ {
. %>%
str_replace_all("\\.", "") %>% # Remove thousands separator (period)
str_replace(",", ".") %>% # Replace decimal comma with period
as.numeric()
}))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(...)`.
## Caused by warning in `Opinion %>% str_replace_all("\\.", "") %>% str_replace(",", ".") %>% as.numeric()`:
## ! NAs introduced by coercion
JPM
# Final datasets
JPM= JPM %>% mutate(index = row_number()) %>% select(index, everything())
JPM
write.csv(JPM, "train.csv", row.names = FALSE)
write.csv(label, "label.csv", row.names = FALSE)